#install.packages(c("forecast", "expsmooth", "seasonal")) 
library(TTR)
library(forecast) 
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(tseries) 
library(expsmooth) 
library(fpp2)
## ── Attaching packages ────────────────────────────────────────────── fpp2 2.5 ──
## ✔ ggplot2 3.4.2     ✔ fma     2.5
## 
library(seasonal)
library(MASS)
## 
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol
library(stats)
library(TSA)
## Registered S3 methods overwritten by 'TSA':
##   method       from    
##   fitted.Arima forecast
##   plot.Arima   forecast
## 
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
## 
##     acf, arima
## The following object is masked from 'package:utils':
## 
##     tar
library(forecast)
library(ggplot2)
library(tseries)
library(imputeTS)
## 
## Attaching package: 'imputeTS'
## The following object is masked from 'package:tseries':
## 
##     na.remove
library(vars)
## Loading required package: strucchange
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following object is masked from 'package:imputeTS':
## 
##     na.locf
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: urca
## Loading required package: lmtest
library(timetk)
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(lmtest)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(xts)
library(dplyr)
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite

Step 1: Load data and plot the time series

# Step 1: Load the raw LA AQI extracts and convert them to daily time series.
# Empty strings in the CSVs are treated as missing values.
path <- "/Users/aashishsingh/Desktop/Time Series - Final Project/"
pm2_5_data <- read.csv(paste0(path, "la_pm2_5_pollutant_data.csv"),
                       na.strings = c("", "NA"))
ozone_data <- read.csv(paste0(path, "la_ozone_pollutant_data.csv"),
                       na.strings = c("", "NA"))

# Convert data into ts objects.
# NOTE: ts() accepts a `start` of at most two values (time unit, sample within
# unit); the original c(1999, 1, 1) silently dropped the third element, so
# c(1999, 1) is the explicit equivalent. frequency = 365.25 approximates daily
# observations while accounting for leap years.
pm2_5_ts <- ts(pm2_5_data$PM2.5.AQI.Value, start = c(1999, 1), frequency = 365.25)
ozone_ts <- ts(ozone_data$Ozone.AQI.Value, start = c(1999, 1), frequency = 365.25)

plot(pm2_5_ts, main="Los Angeles PM 2.5 AQI")

plot(ozone_ts, main="Los Angeles Ozone AQI")

Step 2: Check if data is stationary (KPSS/ADF Test)

# KPSS test: null hypothesis is that the series IS level-stationary.
kpss.test(pm2_5_ts)
## Warning in kpss.test(pm2_5_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_ts
## KPSS Level = 6.951, Truncation lag parameter = 12, p-value = 0.01
# The reported p-value is 0.01, which is smaller than 0.05, and would suggest 
# that we reject the null hypothesis of level stationarity and conclude that 
# there is evidence that the data is non-stationary.

# ADF test: null hypothesis is that the series is NON-stationary (unit root).
adf.test(pm2_5_ts)
## Warning in adf.test(pm2_5_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_ts
## Dickey-Fuller = -14.554, Lag order = 20, p-value = 0.01
## alternative hypothesis: stationary
# Similarly the Augmented Dickey-Fuller test results in a p-value of 0.01 (<0.05),
# so we reject the null hypothesis that the time series is non-stationary
# and conclude that the data is stationary.

# The two tests disagree (KPSS: non-stationary; ADF: stationary), so we inspect
# the ACF/PACF plots to judge stationarity visually.
acf(pm2_5_ts, main = "ACF of Original Time Series")

pacf(pm2_5_ts, main = "PACF of Original Time Series")

#### Step 3: Prepare data for Weekly & Monthly extraction and EDA

# Create a data frame pairing each daily AQI value with its calendar date
# (dates parsed from the source CSV's month/day/year strings).
pm2_5_df <- as.data.frame(pm2_5_ts)
pm2_5_df$Date <- mdy(pm2_5_data$Date)
colnames(pm2_5_df) <- c("PM2.5.AQI.Value", "Date")

# Convert to xts for weekly & monthly aggregation. Build the xts directly from
# the numeric AQI column indexed by Date, rather than coercing the whole mixed
# (numeric + Date) data frame to a matrix and dropping the Date column
# positionally afterwards -- same one-column result, without the coercion step.
pm2_5_xts <- xts(pm2_5_df[, "PM2.5.AQI.Value", drop = FALSE],
                 order.by = pm2_5_df$Date)

Step 4: Understand PM2.5 AQI variance across years

# Let's see how seasonality looks over time and whether the variance changes.
pm2_5_df$Month <- month(pm2_5_df$Date)
pm2_5_df$Year <- year(pm2_5_df$Date)

# Average AQI per (Year, Month). Grouping by bare column names (instead of
# `pm2_5_df$Year`, which bypasses the piped data and produces mangled column
# names that then need renaming) is the idiomatic dplyr form;
# `.groups = "drop"` returns an ungrouped result and silences the
# "grouped output" message.
avg_aqi_by_month_year <- pm2_5_df %>%
  group_by(Year, Month) %>%
  summarise(avg_value = mean(PM2.5.AQI.Value), .groups = "drop")

# Pivot the (Year, Month) averages into a Month x Year table. tidyr's
# pivot_wider() replaces the original 25-branch hand-written SQL CASE pivot
# and generalizes automatically to whatever years are present in the data.
# as.data.frame() first strips any residual dplyr grouping metadata.
reshape_avg_aqi_by_month_year <- as.data.frame(avg_aqi_by_month_year) %>%
  pivot_wider(names_from = Year, values_from = avg_value, names_sort = TRUE) %>%
  arrange(Month) %>%
  as.data.frame()

# One box per year; drop the Month id column (first column).
boxplot(reshape_avg_aqi_by_month_year[-1])

# We can see that over the years there is a downward trend and the variance is
# decreasing, except in 2020 when we see a high peak likely caused by wildfires.
# Link: https://en.wikipedia.org/wiki/Lake_Fire_(2020)

Step 5a: Extract values for time series and plot the series - Weekly

# Aggregate the daily xts series to weekly means.
pm2_5_weekly <- apply.weekly(pm2_5_xts, mean)
# Restrict to 1999-01-03 through 2023-08-11 and rebuild as a ts object;
# 52.18 is the average number of weeks per year (365.25 / 7).
pm2_5_weekly_ts <- ts(pm2_5_weekly["19990103/20230811"],
                      start = c(1999,1),
                      frequency = 52.18)
plot(pm2_5_weekly_ts)

# Strong seasonality, strong cyclic pattern, light downward trend, and the
# variance appears to be shrinking over time.

# Stationarity checks on the weekly series (KPSS vs ADF, as in Step 2).
kpss.test(pm2_5_weekly_ts)
## Warning in kpss.test(pm2_5_weekly_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_weekly_ts
## KPSS Level = 3.4904, Truncation lag parameter = 7, p-value = 0.01
adf.test(pm2_5_weekly_ts)
## Warning in adf.test(pm2_5_weekly_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_weekly_ts
## Dickey-Fuller = -8.4636, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# The tests disagree again (KPSS: non-stationary; ADF: stationary).
acf(pm2_5_weekly_ts, main = "ACF of Weekly Time Series")

pacf(pm2_5_weekly_ts, main = "PACF of Weekly Time Series")

Step 6a: Split data into train and test - Weekly

# Split the data into a training (1999 through 2020) and test (2021 onwards)
# dataset.
# NOTE(review): with a non-integer frequency (52.18) the c(year, week)
# positions are approximate; this is also the source of the
# "'k' is not an integer" warning raised by snaive() below.
train_weekly <- window(pm2_5_weekly_ts, start = c(1999,1), end=c(2020,52))
test_weekly <- window(pm2_5_weekly_ts, start = c(2021,1))

Step 7a: Decompose the time series plot - Weekly

# Spectral analysis: the periodogram shows which frequencies carry the most
# power in the training series.
periodogram(train_weekly, log = "no", plot=TRUE, 
            ylab = "Periodogram",
            xlab = "Frequency",
            lwd=2, xlim = c(0, 0.06))

# The periodogram shows three notable periodicities:
#  1) A typical yearly (52 weeks) seasonal cycle
#  2) A long cycle repeating roughly every 9.6 years (500 weeks)
#  3) A typical half-yearly (26 weeks) seasonal cycle

# Overall, there is no mixed effect that a normal seasonality model cannot capture.

# Decompose the series into trend, seasonal, and remainder components
# (multiplicative because the seasonal amplitude shrinks with the level).
plot(decompose(train_weekly, type="multiplicative"))

# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall, though there are occasional
#    rises; the long-run trend is downward.
# 2) Winter months have a strong seasonal effect, with Nov and Dec usually
#    being the peak months, likely because cold temperatures trap pollutants
#    in the lower atmosphere.
#    Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months: the mean value rises at the
#    beginning of the year and drops towards the end of the year, i.e. a
#    seasonal effect with a 12-month cycle.


# Remove the yearly seasonality (lag-52 seasonal difference) to expose the
# underlying trend.
train_weekly_diff <- diff(train_weekly, lag = 52)
autoplot(train_weekly_diff, ylab="Train Seasonal Differencing - Weekly Data")

# Is the seasonally differenced series stationary?
kpss.test(train_weekly_diff)
## Warning in kpss.test(train_weekly_diff): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_weekly_diff
## KPSS Level = 0.17702, Truncation lag parameter = 7, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail 
# to reject the null hypothesis of level stationarity (conclusion: stationary)

adf.test(train_weekly_diff)
## Warning in adf.test(train_weekly_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_weekly_diff
## Dickey-Fuller = -8.5534, Lag order = 10, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that 
# the time series is non-stationary (conclusion: stationary)
# Both tests now agree: after seasonal differencing the series is stationary.

acf(train_weekly_diff, main = "ACF of Seasonal Differencing Time Series - Weekly Data")

pacf(train_weekly_diff, main = "PACF of Seasonal Differencing Time Series - Weekly Data")

Step 8a: Benchmark using snaive model - Weekly

# Benchmark: seasonal naive forecast over the 110-week hold-out horizon
# (2021-01 through 2023-07). Each forecast repeats the value from the same
# week one season earlier.
forecast_snaive_weekly <- snaive(train_weekly, h = 110)
## Warning in lag.default(y, -lag): 'k' is not an integer

# Overlay the snaive forecast on the held-out observations.
plot(forecast_snaive_weekly,
     main = "PM 2.5 AQI Forecast - SNaive (Weekly)",
     xlab = "Week", ylab = "PM 2.5 AQI")
lines(test_weekly)

# Accuracy against the test window: compute the forecast errors once, then
# derive MAPE / MAE / MSE from the shared error vector.
err_snaive_weekly <- test_weekly - forecast_snaive_weekly$mean

# Mean Absolute Percentage Error (MAPE)
mape_snaive_weekly <- mean(abs(err_snaive_weekly / test_weekly))
mape_snaive_weekly
## [1] 0.2749545
# Mean Absolute Error (MAE)
mae_snaive_weekly <- mean(abs(err_snaive_weekly))
mae_snaive_weekly
## [1] 16.28571
# Mean Squared Error (MSE)
mse_snaive_weekly <- mean(err_snaive_weekly^2)
mse_snaive_weekly
## [1] 599.6278
# Residual diagnostics (ACF plot + Ljung-Box test).
checkresiduals(forecast_snaive_weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 509.98, df = 104.36, p-value < 2.2e-16
## 
## Model df: 0.   Total lags used: 104.36

Step 9a: Forecast using ETS - Weekly

# Build the exponential smoothing model; lambda="auto" selects a Box-Cox
# transform, additive.only restricts to additive error/trend/season forms.
model_ets_weekly <- ets(train_weekly, lambda="auto", additive.only=TRUE)
## Warning in ets(train_weekly, lambda = "auto", additive.only = TRUE): I can't
## handle data with frequency greater than 24. Seasonality will be ignored. Try
## stlf() if you need seasonal forecasts.
# NOTE(review): because ets() cannot handle frequency > 24, the fitted model
# is the non-seasonal ETS(A,N,N) -- the weekly seasonality is ignored here.
summary(model_ets_weekly)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = train_weekly, additive.only = TRUE, lambda = "auto") 
## 
##   Box-Cox transformation: lambda= -0.5881 
## 
##   Smoothing parameters:
##     alpha = 0.2953 
## 
##   Initial states:
##     l = 1.5899 
## 
##   sigma:  0.0179
## 
##       AIC      AICc       BIC 
## -1146.438 -1146.417 -1131.304 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE    MAPE      MASE      ACF1
## Training set 1.821839 16.44366 11.86525 -1.428602 16.4125 0.7341754 0.1189196
# Forecast the same 110-week hold-out horizon used for the snaive benchmark.
forecast_ets_weekly <- forecast(model_ets_weekly, h=110)

# Plot the forecasts for the ETS model.
# NOTE(review): xlab says "Month" but this is the weekly series -- likely a
# copy/paste slip from the monthly section.
plot(forecast_ets_weekly, main = "PM 2.5 AQI Forecast - ETS",
         xlab = "Month", ylab = "PM 2.5 AQI")
lines(test_weekly)

# Compare the forecast with the actual test data by calculating MAPE, MAE, MSE
# Mean Absolute Percentage Error (MAPE)
mape_ets_weekly <- mean(abs((test_weekly - forecast_ets_weekly$mean)/test_weekly))
mape_ets_weekly
## [1] 0.3342543
# Mean Absolute Error (MAE)
mae_ets_weekly <- mean(abs((test_weekly - forecast_ets_weekly$mean)))
mae_ets_weekly
## [1] 19.10983
# Mean Squared Error (MSE)
mse_ets_weekly <- mean((test_weekly - forecast_ets_weekly$mean)^2)
mse_ets_weekly
## [1] 434.2529
# Evaluate the residuals (ACF plot + Ljung-Box test).
checkresiduals(forecast_ets_weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 202.74, df = 104.36, p-value = 2.727e-08
## 
## Model df: 0.   Total lags used: 104.36
# Explicit Ljung-Box at one full season (52 weeks) of lags: residuals are
# still significantly autocorrelated.
Box.test(residuals(forecast_ets_weekly), lag = 52, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(forecast_ets_weekly)
## X-squared = 103.65, df = 52, p-value = 2.742e-05

Step 10a: Forecast using Neural Network - Weekly

# Fit an autoregressive neural network (NNAR) on the weekly training data:
# size = 5 hidden nodes, weight decay 0.5 for regularization, averaged over
# 1000 repeated fits; lambda="auto" applies a Box-Cox transform.
model_nn_weekly <- nnetar(train_weekly,
                           size=5, 
                           decay=0.5,
                           repeats=1000, 
                           lambda="auto")
summary(model_nn_weekly)
##           Length Class        Mode     
## x         1147   ts           numeric  
## m            1   -none-       numeric  
## p            1   -none-       numeric  
## P            1   -none-       numeric  
## scalex       2   -none-       list     
## size         1   -none-       numeric  
## lambda       1   -none-       numeric  
## subset    1147   -none-       numeric  
## model     1000   nnetarmodels list     
## nnetargs     1   -none-       list     
## fitted    1147   ts           numeric  
## residuals 1147   ts           numeric  
## lags        19   -none-       numeric  
## series       1   -none-       character
## method       1   -none-       character
## call         6   -none-       call
# PI=FALSE skips the (slow, simulation-based) prediction intervals.
forecast_nn_weekly <- forecast(model_nn_weekly, PI=FALSE, h=110)

# Plot the forecasts for the NN model.
plot(forecast_nn_weekly, main = "PM 2.5 AQI Forecast - Neural Network",
         xlab = "Weekly", ylab = "PM 2.5 AQI")
lines(test_weekly)

# Compare the forecast with the actual test data by calculating MAPE, MAE, MSE
# Mean Absolute Percentage Error (MAPE)
mape_nn_weekly <- mean(abs((test_weekly - forecast_nn_weekly$mean)/test_weekly))
mape_nn_weekly
## [1] 0.1910434
# Mean Absolute Error (MAE)
mae_nn_weekly <- mean(abs((test_weekly - forecast_nn_weekly$mean)))
mae_nn_weekly
## [1] 11.29234
# Mean Squared Error (MSE)
mse_nn_weekly <- mean((test_weekly - forecast_nn_weekly$mean)^2)
mse_nn_weekly
## [1] 195.8605
# Evaluate the residuals: p-value = 0.401 means no significant residual
# autocorrelation -- the best diagnostic result among the weekly models.
checkresiduals(forecast_nn_weekly)

## 
##  Ljung-Box test
## 
## data:  Residuals from NNAR(18,1,5)[52]
## Q* = 107.35, df = 104.36, p-value = 0.401
## 
## Model df: 0.   Total lags used: 104.36

Step 5b: Extract values for time series and plot the series - Monthly

# Aggregate the daily xts series to monthly means.
pm2_5_monthly <- apply.monthly(pm2_5_xts, mean)

# Restrict to 1999-01-31 through 2023-08-11 and rebuild as a standard monthly
# ts (frequency = 12, an exact integer -- unlike the weekly series).
pm2_5_monthly_ts <- ts(pm2_5_monthly["19990131/20230811"], start = c(1999,1), frequency = 12)
plot(pm2_5_monthly_ts)

# Stationarity checks on the monthly series (KPSS vs ADF, as in Step 2).
kpss.test(pm2_5_monthly_ts)
## Warning in kpss.test(pm2_5_monthly_ts): p-value smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  pm2_5_monthly_ts
## KPSS Level = 1.9774, Truncation lag parameter = 5, p-value = 0.01
adf.test(pm2_5_monthly_ts)
## Warning in adf.test(pm2_5_monthly_ts): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  pm2_5_monthly_ts
## Dickey-Fuller = -7.4466, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# The tests disagree again (KPSS: non-stationary; ADF: stationary).
acf(pm2_5_monthly_ts, main = "ACF of Monthly Time Series")

pacf(pm2_5_monthly_ts, main = "PACF of Monthly Time Series")

Step 6b: Split data into train and test - Monthly

# Split the data into a training (1999 through 2020) and test (2021 onwards)
# dataset, mirroring the weekly split.
train_monthly <- window(pm2_5_monthly_ts, start = c(1999,1), end=c(2020,12))
test_monthly <- window(pm2_5_monthly_ts, start = c(2021,1))

Step 7b: Decompose the time series plot - Monthly

# Spectral analysis of the monthly training series.
periodogram(train_monthly, log = "no", plot=TRUE, 
            ylab = "Periodogram",
            xlab = "Frequency",
            lwd=2, xlim = c(0, 0.2))

# The periodogram shows three notable periodicities:
#  1) A typical yearly (12 months) seasonal cycle
#  2) A long cycle repeating roughly every 8-9 years (100 months)
#  3) A typical half-yearly (6 months) seasonal cycle

# Overall, there is no mixed effect that a normal seasonality model cannot capture.

# Decompose the series into trend, seasonal, and remainder components
# (multiplicative because the seasonal amplitude shrinks with the level).
plot(decompose(train_monthly, type="multiplicative"))

# Important Inferences
# 1) The PM2.5 AQI has been decreasing overall, though there are occasional
#    rises; the long-run trend is downward.
# 2) Winter months have a strong seasonal effect, with Nov and Dec usually
#    being the peak months, likely because cold temperatures trap pollutants
#    in the lower atmosphere.
#    Link: https://www.accuweather.com/en/health-wellness/why-air-pollution-is-worse-in-winter/689434#:~:text=Cold%20air%20is%20denser%20and%20moves%20slower%20than%20warm%20air,rate%20than%20during%20the%20summer.
# 3) We can see a seasonal cycle of 12 months: the mean value rises at the
#    beginning of the year and drops towards the end of the year, i.e. a
#    seasonal effect with a 12-month cycle.

# Remove the yearly seasonality (lag-12 seasonal difference) to expose the
# underlying trend.
train_monthly_diff <- diff(train_monthly, lag = 12)
autoplot(train_monthly_diff, ylab="Train Seasonal Differencing (Monthly)")

# Is the seasonally differenced series stationary?
kpss.test(train_monthly_diff)
## Warning in kpss.test(train_monthly_diff): p-value greater than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  train_monthly_diff
## KPSS Level = 0.14573, Truncation lag parameter = 5, p-value = 0.1
# The reported p-value is 0.1, which is > 0.05, and would suggest that we fail 
# to reject the null hypothesis of level stationarity (conclusion: stationary)

adf.test(train_monthly_diff)
## Warning in adf.test(train_monthly_diff): p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train_monthly_diff
## Dickey-Fuller = -5.1056, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# The reported p-value of 0.01 (<0.05) so we do reject the null hypothesis that 
# the time series is non-stationary (conclusion: stationary)
# Both tests now agree: after seasonal differencing the series is stationary.

acf(train_monthly_diff, main = "ACF of Seasonal Differencing Time Series")

pacf(train_monthly_diff, main = "PACF of Seasonal Differencing Time Series")

Step 8b: Benchmark using snaive model - Monthly

# Benchmark: seasonal naive forecast over the 31-month hold-out horizon
# (2021-01 through 2023-07). Each forecast repeats the value from the same
# month one year earlier.
forecast_snaive_monthly <- snaive(train_monthly, h = 31)

# Overlay the snaive forecast on the held-out observations.
plot(forecast_snaive_monthly,
     main = "PM 2.5 AQI Forecast - SNaive",
     xlab = "Month", ylab = "PM 2.5 AQI")
lines(test_monthly)

# Accuracy against the test window: compute the forecast errors once, then
# derive MAPE / MAE / MSE from the shared error vector.
err_snaive_monthly <- test_monthly - forecast_snaive_monthly$mean

# Mean Absolute Percentage Error (MAPE)
mape_snaive_monthly <- mean(abs(err_snaive_monthly / test_monthly))
mape_snaive_monthly
## [1] 0.20778
# Mean Absolute Error (MAE)
mae_snaive_monthly <- mean(abs(err_snaive_monthly))
mae_snaive_monthly
## [1] 12.00976
# Mean Squared Error (MSE)
mse_snaive_monthly <- mean(err_snaive_monthly^2)
mse_snaive_monthly
## [1] 282.8901
# Residual diagnostics (ACF plot + Ljung-Box test).
checkresiduals(forecast_snaive_monthly)

## 
##  Ljung-Box test
## 
## data:  Residuals from Seasonal naive method
## Q* = 82.387, df = 24, p-value = 2.523e-08
## 
## Model df: 0.   Total lags used: 24

Step 9b: Forecast using ETS - Monthly

# Build the exponential smoothing model; lambda="auto" selects a Box-Cox
# transform. With frequency = 12 a seasonal ETS can be fitted here (unlike
# the weekly series).
model_ets_monthly <- ets(train_monthly, lambda="auto")
summary(model_ets_monthly)
## ETS(A,N,A) 
## 
## Call:
##  ets(y = train_monthly, lambda = "auto") 
## 
##   Box-Cox transformation: lambda= -0.0996 
## 
##   Smoothing parameters:
##     alpha = 0.238 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 3.5468 
##     s = 0.0787 0.0513 0.0374 0.0224 0.0067 0.0386
##            -0.0077 -0.0477 -0.1042 -0.0735 -0.0636 0.0616
## 
##   sigma:  0.0924
## 
##      AIC     AICc      BIC 
## 230.3216 232.2571 283.9609 
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 0.4199959 10.02138 7.654152 -1.045442 10.71358 0.7823768 0.1335483
# Forecast the same 31-month hold-out horizon used for the snaive benchmark.
forecast_ets_monthly <- forecast(model_ets_monthly, h=31)

# Plot the forecasts for the ETS model.
plot(forecast_ets_monthly, main = "PM 2.5 AQI Forecast - ETS",
         xlab = "Month", ylab = "PM 2.5 AQI")
lines(test_monthly)

# Compare the forecast with the actual test data by calculating MAPE, MAE, MSE
# Mean Absolute Percentage Error (MAPE)
mape_ets_monthly <- mean(abs((test_monthly - forecast_ets_monthly$mean)/test_monthly))
mape_ets_monthly
## [1] 0.1962783
# Mean Absolute Error (MAE)
mae_ets_monthly <- mean(abs((test_monthly - forecast_ets_monthly$mean)))
mae_ets_monthly
## [1] 11.06968
# Mean Squared Error (MSE)
mse_ets_monthly <- mean((test_monthly - forecast_ets_monthly$mean)^2)
mse_ets_monthly
## [1] 163.3788
# Evaluate the residuals (ACF plot + Ljung-Box test).
checkresiduals(forecast_ets_monthly)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 63.91, df = 24, p-value = 1.757e-05
## 
## Model df: 0.   Total lags used: 24
# Explicit Ljung-Box at one full season (12 months) of lags: residuals are
# still significantly autocorrelated.
Box.test(residuals(forecast_ets_monthly), lag = 12, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  residuals(forecast_ets_monthly)
## X-squared = 33.762, df = 12, p-value = 0.0007353

Step 10b: Forecast using Neural Network - Monthly

# Fit an autoregressive neural network (NNAR) on the monthly training data:
# P = 12 seasonal lags, size = 5 hidden nodes, weight decay 0.5, averaged over
# 1000 repeated fits; lambda="auto" applies a Box-Cox transform.
model_nn_monthly <- nnetar(train_monthly,
                           P=12,
                           size=5, 
                           decay=0.5,
                           repeats=1000, 
                           lambda="auto")
summary(model_nn_monthly)
##           Length Class        Mode     
## x          264   ts           numeric  
## m            1   -none-       numeric  
## p            1   -none-       numeric  
## P            1   -none-       numeric  
## scalex       2   -none-       list     
## size         1   -none-       numeric  
## lambda       1   -none-       numeric  
## subset     264   -none-       numeric  
## model     1000   nnetarmodels list     
## nnetargs     1   -none-       list     
## fitted     264   ts           numeric  
## residuals  264   ts           numeric  
## lags        16   -none-       numeric  
## series       1   -none-       character
## method       1   -none-       character
## call         7   -none-       call
# PI=FALSE skips the (slow, simulation-based) prediction intervals.
forecast_nn_monthly <- forecast(model_nn_monthly, PI=FALSE, h=31)

# Plot the forecasts for the NN model.
plot(forecast_nn_monthly, main = "PM 2.5 AQI Forecast - Neural Network",
         xlab = "Month", ylab = "PM 2.5 AQI")
lines(test_monthly)

# Compare the forecast with the actual test data by calculating MAPE, MAE, MSE
# Mean Absolute Percentage Error (MAPE)
mape_nn_monthly <- mean(abs((test_monthly - forecast_nn_monthly$mean)/test_monthly))
mape_nn_monthly
## [1] 0.1453023
# Mean Absolute Error (MAE)
mae_nn_monthly <- mean(abs((test_monthly - forecast_nn_monthly$mean)))
mae_nn_monthly
## [1] 8.341272
# Mean Squared Error (MSE)
mse_nn_monthly <- mean((test_monthly - forecast_nn_monthly$mean)^2)
mse_nn_monthly
## [1] 109.5097
# Evaluate the residuals: p-value = 0.9311 means no significant residual
# autocorrelation -- the best diagnostic result among the monthly models.
checkresiduals(forecast_nn_monthly)

## 
##  Ljung-Box test
## 
## data:  Residuals from NNAR(4,12,5)[12]
## Q* = 14.63, df = 24, p-value = 0.9311
## 
## Model df: 0.   Total lags used: 24